{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "78eaca77",
   "metadata": {},
   "outputs": [],
   "source": [
    "import soundfile as sf\n",
    "import subprocess\n",
    "from glob import glob\n",
    "import os\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "from tqdm import tqdm\n",
    "\n",
    "def chunks(l, n):\n",
    "    \"\"\"Lazily split ``l`` into consecutive slices of at most ``n`` items.\n",
    "\n",
    "    Yields ``(slice, position)`` pairs, where ``position`` is the zero-based\n",
    "    ordinal of the slice. The final slice may hold fewer than ``n`` items.\n",
    "    \"\"\"\n",
    "    for position, start in enumerate(range(0, len(l), n)):\n",
    "        stop = start + n\n",
    "        yield (l[start:stop], position)\n",
    "\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    \"\"\"Fan ``strings`` out over a pool of worker processes.\n",
    "\n",
    "    The input is cut into slices with ``chunks`` and ``function`` is called\n",
    "    once per slice with a ``(slice, index)`` tuple.\n",
    "\n",
    "    Args:\n",
    "        strings: Sized, sliceable collection of work items.\n",
    "        function: Callable taking one ``(slice, index)`` tuple. It may return\n",
    "            an iterable of results, or ``None`` when it only has side effects.\n",
    "        cores: Number of worker processes to start.\n",
    "        returned: When true, flatten the per-slice results into one list.\n",
    "\n",
    "    Returns:\n",
    "        A flat list of results when ``returned`` is true, otherwise ``None``.\n",
    "    \"\"\"\n",
    "    if len(strings) == 0:\n",
    "        # Nothing to schedule; do not start a pool for no work.\n",
    "        return [] if returned else None\n",
    "\n",
    "    # Floor division gives 0 when there are fewer items than cores, and a zero\n",
    "    # step makes ``range`` inside ``chunks`` raise ValueError.\n",
    "    chunk_size = max(1, len(strings) // cores)\n",
    "    df_split = chunks(strings, chunk_size)\n",
    "\n",
    "    pool = Pool(cores)\n",
    "    try:\n",
    "        pooled = pool.map(function, df_split)\n",
    "    finally:\n",
    "        # Always release the workers, even when a task raised.\n",
    "        pool.close()\n",
    "        pool.join()\n",
    "\n",
    "    if returned:\n",
    "        # Side-effect-only workers return None; skip those instead of failing\n",
    "        # on a non-iterable after all the work has already been done.\n",
    "        return list(itertools.chain.from_iterable(\n",
    "            part for part in pooled if part is not None\n",
    "        ))\n",
    "    \n",
    "def convert_mp4_to_mp3(input_file, output_file, sample_rate=16000):\n",
    "    \"\"\"Extract the audio of a media file into a mono audio file with ``ffmpeg``.\n",
    "\n",
    "    Args:\n",
    "        input_file: Path of the source media file.\n",
    "        output_file: Path of the audio file to write.\n",
    "        sample_rate: Output sampling rate, passed to ``-ar``.\n",
    "\n",
    "    Raises:\n",
    "        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero status.\n",
    "\n",
    "    Example:\n",
    "        convert_mp4_to_mp3('input.mp4', 'output.mp3')\n",
    "    \"\"\"\n",
    "    cmd = [\n",
    "        'ffmpeg',\n",
    "        # ffmpeg is interactive on stdin by default; in pooled workers that can\n",
    "        # block on a prompt or eat input meant for another process.\n",
    "        '-nostdin',\n",
    "        '-i', input_file,\n",
    "        '-vn',  # drop the video stream\n",
    "        '-ar', str(sample_rate),\n",
    "        '-ac', '1',  # single audio channel\n",
    "        '-b:a', '48k',  # audio bitrate\n",
    "        output_file\n",
    "    ]\n",
    "    subprocess.run(\n",
    "        cmd,\n",
    "        stdin=subprocess.DEVNULL,\n",
    "        stdout=subprocess.DEVNULL,\n",
    "        stderr=subprocess.DEVNULL,\n",
    "        check=True,\n",
    "    )\n",
    "\n",
    "\n",
    "def loop(files):\n",
    "    \"\"\"Convert one slice of media files into MP3s under ``vggsound-mp3``.\n",
    "\n",
    "    Conversion is best-effort: a file that fails is recorded and skipped so the\n",
    "    rest of the slice is still processed.\n",
    "\n",
    "    Args:\n",
    "        files: ``(paths, index)`` tuple as yielded by ``chunks``; only the\n",
    "            paths are used.\n",
    "\n",
    "    Returns:\n",
    "        List of ``(path, error)`` tuples, one per file that could not be\n",
    "        converted, where ``error`` is the ``repr`` of the exception.\n",
    "    \"\"\"\n",
    "    files, _ = files\n",
    "    # The conversion cannot succeed without its target directory.\n",
    "    os.makedirs('vggsound-mp3', exist_ok=True)\n",
    "    failed = []\n",
    "    for f in tqdm(files):\n",
    "        # Swap only the real extension; a plain str.replace would also rewrite\n",
    "        # a '.mp4' that appears elsewhere in the file name.\n",
    "        stem = os.path.splitext(os.path.basename(f))[0]\n",
    "        new_f = os.path.join('vggsound-mp3', stem + '.mp3')\n",
    "        try:\n",
    "            convert_mp4_to_mp3(f, new_f)\n",
    "        except Exception as e:\n",
    "            # Keep going, but do not lose the failure (and do not trap\n",
    "            # KeyboardInterrupt / SystemExit like a bare except would).\n",
    "            failed.append((f, repr(e)))\n",
    "    return failed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "053b59bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from huggingface_hub import snapshot_download\n",
    "\n",
    "# snapshot_download(\n",
    "#     repo_id=\"Loie/VGGSound\",\n",
    "#     repo_type=\"dataset\",\n",
    "#     allow_patterns=[\n",
    "#         '*.tar.gz', \n",
    "#     ],\n",
    "#     local_dir = './VGGSound'\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "36760f61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from glob import glob\n",
    "# import os\n",
    "\n",
    "# files = glob('VGGSound/*.tar.gz')\n",
    "# for f in files:\n",
    "#     print(f)\n",
    "#     os.system(f'tar -zxf {f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ac084414",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !rm -rf vggsound-mp3\n",
    "# !mkdir vggsound-mp3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2fd97bb7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "197957"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# files = glob('scratch/shared/beegfs/hchen/train_data/VGGSound_final/video/*.mp4')\n",
    "# len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d437a520",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 33%|███████████████████████████▏                                                       | 1296/3959 [13:29<28:16,  1.57it/s]IOPub message rate exceeded.\n",
      "The notebook server will temporarily stop sending output\n",
      "to the client in order to avoid crashing it.\n",
      "To change this limit, set the config variable\n",
      "`--NotebookApp.iopub_msg_rate_limit`.\n",
      "\n",
      "Current values:\n",
      "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
      "NotebookApp.rate_limit_window=3.0 (secs)\n",
      "\n",
      " 74%|█████████████████████████████████████████████████████████████▎                     | 2922/3959 [30:31<12:57,  1.33it/s]"
     ]
    }
   ],
   "source": [
    "# multiprocessing(files, loop, cores = 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "538faa36",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12G\tvggsound-mp3\r\n"
     ]
    }
   ],
   "source": [
    "# Report the total on-disk size of the converted audio directory.\n",
    "!du -hs vggsound-mp3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7d1daa0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/Loie/VGGSound/resolve/main/vggsound.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "1b076227",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(199467, 4)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# The CSV is read without a header row, so its columns are addressed by position.\n",
    "# Later cells use columns 0 and 1 to build the audio file name, column 2 as the\n",
    "# label and column 3 as the split name.\n",
    "df = pd.read_csv('vggsound.csv', header = None)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "9343bdde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stringified Python list of every distinct value in column 2; it is substituted\n",
    "# verbatim into the question templates.\n",
    "unique_labels = str(df[2].unique().tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "57a43e95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt templates; the `{labels}` placeholder is filled in through `str.format`.\n",
    "questions = [\n",
    "    'given the labels\\n{labels}\\n\\nwhat is the label for the audio',\n",
    "    'what is the label for audio\\n\\nthe labels: {labels}'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "ea0caa58",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████| 199467/199467 [00:16<00:00, 12388.96it/s]\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "import random\n",
    "import json\n",
    "\n",
    "# Fixed seed so the prompt template picked for each row is the same on every run.\n",
    "PROMPT_SEED = 0\n",
    "prompt_rng = random.Random(PROMPT_SEED)\n",
    "\n",
    "# Records grouped by the split name found in column 3 of the CSV.\n",
    "vggsound = defaultdict(list)\n",
    "\n",
    "for i in tqdm(range(len(df))):\n",
    "    row = df.iloc[i]  # one positional lookup per row instead of one per column\n",
    "    # Column 0 plus the zero-padded integer from column 1 forms the audio file stem.\n",
    "    audio_filename = os.path.join('vggsound-mp3', f\"{row[0]}_{int(row[1]):06d}.mp3\")\n",
    "    if not os.path.exists(audio_filename):\n",
    "        # No converted audio on disk for this row; leave it out.\n",
    "        continue\n",
    "    split = row[3]\n",
    "    d = {\n",
    "        'question': prompt_rng.choice(questions).format(labels = unique_labels),\n",
    "        'answer': row[2],\n",
    "        'audio_filename': audio_filename,\n",
    "        'metadata': json.dumps(row.to_dict())\n",
    "    }\n",
    "    vggsound[split].append(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "8b5be7fd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': \"what is the label for audio\\n\\nthe labels: ['people marching', 'waterfall burbling', 'playing tennis', 'people belly laughing', 'car engine starting', 'alarm clock ringing', 'female speech, woman speaking', 'cricket chirping', 'wind noise', 'foghorn', 'people battle cry', 'playing volleyball', 'female singing', 'playing harpsichord', 'male speech, man speaking', 'playing bassoon', 'playing piano', 'people clapping', 'bee, wasp, etc. buzzing', 'baby babbling', 'people whispering', 'coyote howling', 'metronome', 'playing harp', 'airplane', 'rope skipping', 'ambulance siren', 'people coughing', 'pheasant crowing', 'bird wings flapping', 'cap gun shooting', 'child singing', 'race car, auto racing', 'male singing', 'playing bass guitar', 'playing violin, fiddle', 'playing bongo', 'playing erhu', 'tap dancing', 'playing electronic organ', 'playing congas', 'subway, metro, underground', 'car passing by', 'orchestra', 'playing acoustic guitar', 'hammering nails', 'duck quacking', 'playing harmonica', 'dog growling', 'church bell ringing', 'playing table tennis', 'skateboarding', 'playing steel guitar, slide guitar', 'playing squash', 'playing bass drum', 'police radio chatter', 'horse clip-clop', 'horse neighing', 'playing double bass', 'playing cello', 'people booing', 'playing electric guitar', 'raining', 'gibbon howling', 'toilet flushing', 'missile launch', 'driving buses', 'motorboat, speedboat acceleration', 'playing didgeridoo', 'playing glockenspiel', 'skiing', 'playing oboe', 'arc welding', 'sheep bleating', 'chimpanzee pant-hooting', 'playing cornet', 'lighting firecrackers', 'bowling impact', 'cow lowing', 'planing timber', 'people finger snapping', 'pigeon, dove cooing', 'fire crackling', 'playing tabla', 'cupboard opening or closing', 'crow cawing', 'vehicle horn, car horn, honking', 'playing washboard', 'fire truck siren', 'pumping water', 'cat growling', 'beat boxing', 'playing bagpipes', 'playing trumpet', 'woodpecker pecking tree', 
'playing banjo', 'people crowd', 'chicken crowing', 'rowboat, canoe, kayak rowing', 'forging swords', 'footsteps on snow', 'driving motorcycle', 'owl hooting', 'mosquito buzzing', 'air horn', 'child speech, kid speaking', 'slot machine', 'lions growling', 'playing flute', 'playing synthesizer', 'railroad car, train wagon', 'car engine knocking', 'playing accordion', 'sharpen knife', 'playing drum kit', 'playing trombone', 'train whistling', 'reversing beeps', 'singing choir', 'chainsawing trees', 'printer printing', 'chicken clucking', 'underwater bubbling', 'people nose blowing', 'ice cracking', 'playing lacrosse', 'playing timpani', 'tapping guitar', 'turkey gobbling', 'door slamming', 'playing saxophone', 'playing french horn', 'sea waves', 'playing clarinet', 'using sewing machines', 'vacuum cleaner cleaning floors', 'playing castanets', 'pig oinking', 'lawn mowing', 'basketball bounce', 'eletric blender running', 'thunder', 'dog whimpering', 'car engine idling', 'people sobbing', 'playing theremin', 'fox barking', 'playing ukulele', 'parrot talking', 'playing cymbal', 'people shuffling', 'people eating noodle', 'playing steelpan', 'people farting', 'running electric fan', 'mouse clicking', 'ice cream truck, ice cream van', 'chopping food', 'lathe spinning', 'playing tympani', 'canary calling', 'helicopter', 'playing sitar', 'people screaming', 'baby crying', 'cell phone buzzing', 'train horning', 'donkey, ass braying', 'striking pool', 'cat meowing', 'tractor digging', 'playing hammond organ', 'mouse pattering', 'people hiccup', 'air conditioning noise', 'playing snare drum', 'penguins braying', 'playing hockey', 'wood thrush calling', 'stream burbling', 'playing marimba, xylophone', 'baby laughter', 'playing tambourine', 'singing bowl', 'engine accelerating, revving, vroom', 'yodelling', 'machine gun shooting', 'electric grinder grinding', 'swimming', 'electric shaver, electric razor shaving', 'people babbling', 'police car (siren)', 'opening or closing car 
doors', 'fireworks banging', 'people giggling', 'playing djembe', 'baltimore oriole calling', 'sliding door', 'typing on computer keyboard', 'people running', 'hail', 'driving snowmobile', 'sailing', 'civil defense siren', 'plastic bottle crushing', 'elephant trumpeting', 'goose honking', 'eating with cutlery', 'dog bow-wow', 'cattle, bovinae cowbell', 'people eating crisps', 'skidding', 'dog barking', 'squishing water', 'roller coaster running', 'smoke detector beeping', 'ocean burbling', 'airplane flyby', 'firing cannon', 'ripping paper', 'alligators, crocodiles hissing', 'cuckoo bird calling', 'people cheering', 'people sniggering', 'playing badminton', 'frog croaking', 'people burping', 'golf driving', 'people sneezing', 'mynah bird singing', 'splashing water', 'cattle mooing', 'rapping', 'cat purring', 'magpie calling', 'strike lighter', 'people eating', 'francolin calling', 'bird squawking', 'playing vibraphone', 'playing mandolin', 'lip smacking', 'zebra braying', 'disc scratching', 'popping popcorn', 'cat caterwauling', 'ferret dooking', 'wind rustling leaves', 'elk bugling', 'barn swallow calling', 'volcano explosion', 'heart sounds, heartbeat', 'firing muskets', 'mouse squeaking', 'playing timbales', 'people slapping', 'telephone bell ringing', 'playing shofar', 'opening or closing drawers', 'chinchilla barking', 'people whistling', 'spraying water', 'playing tuning fork', 'train wheels squealing', 'bird chirping, tweeting', 'tornado roaring', 'writing on blackboard with chalk', 'extending ladders', 'scuba diving', 'typing on typewriter', 'dog howling', 'people slurping', 'snake rattling', 'wind chime', 'cheetah chirrup', 'children shouting', 'playing gong', 'people humming', 'snake hissing', 'cat hissing', 'people gargling', 'blowtorch igniting', 'playing bugle', 'dinosaurs bellowing', 'chipmunk chirping', 'playing zither', 'playing darts', 'chopping wood', 'shot football', 'sea lion barking', 'dog baying', 'sloshing water', 'black capped chickadee 
calling', 'lions roaring', 'playing guiro', 'striking bowling', 'bouncing on trampoline', 'eagle screaming', 'hedge trimmer running', 'bathroom ventilation fan running', 'hair dryer drying', 'cutting hair with electric trimmers', 'people eating apple', 'fly, housefly buzzing', 'bull bellowing', 'otter growling', 'opening or closing car electric windows', 'warbler chirping', 'goat bleating', 'whale calling']\",\n",
       " 'answer': 'fire crackling',\n",
       " 'audio_filename': 'vggsound-mp3/-0yRK50zyTI_000030.mp3',\n",
       " 'metadata': '{\"0\": \"-0yRK50zyTI\", \"1\": 30, \"2\": \"fire crackling\", \"3\": \"test\"}'}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check one generated record from the 'test' split.\n",
    "vggsound['test'][10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "61706bbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Build a dataset object from the records collected under the 'train' key.\n",
    "dataset = Dataset.from_list(vggsound['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "a14ee6ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "855a7c1aa972401a836e5f9ad96c77fd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "689e2401ad6c4d1cbb98dae108702dc6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bdbf5e8a9d22471cb0fdc3836ce952b9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/2.99M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "42ff75db751e4b95ae99cf9032b926bf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f4713ed42c8d48e3bd5e0b6f6ace5a4d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/2.99M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bc652cde5ed046379616ff88e9aabdd3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7b6fa5666b9442dba6d510b400635bed",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/2.99M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b883536916cd492298d34b415c3a0ea2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/943 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/6c1456f8f682ed49d8fb1bc55f21e54f895dd21e', commit_message='Upload dataset', commit_description='', oid='6c1456f8f682ed49d8fb1bc55f21e54f895dd21e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Publish the current `dataset` to the Hub repository as the `vggsound_train` split.\n",
    "dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'vggsound_train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "99b7865d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd08447abf80424d8753f10cc8d0678a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4538d13885ed4cedb06d5e167bd56ca0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f05b1d57f0b24334905638f15d1980a0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/839k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8b7a85558d2d4ef299d6a2208f637f67",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/1.36k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/9c1a4c880b7a1bf885fba8377ed9f6707f7d1526', commit_message='Upload dataset', commit_description='', oid='9c1a4c880b7a1bf885fba8377ed9f6707f7d1526', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebind `dataset` to the 'test' records and publish them as the `vggsound_test` split.\n",
    "dataset = Dataset.from_list(vggsound['test'])\n",
    "dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'vggsound_test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "332f1036",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "197956"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every MP3 directly inside the output directory and show how many there are.\n",
    "audio_files = glob('vggsound-mp3/*.mp3')\n",
    "len(audio_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "99d62d18",
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "# Bundle all listed audio files into a single deflate-compressed archive.\n",
    "with zipfile.ZipFile('vggsound.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    "    for f in audio_files:\n",
    "        # arcname=f stores each file under the same relative path it has on disk.\n",
    "        zipf.write(f, arcname=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "46952330",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|███████████████████████▉| 11.7G/11.7G [02:41<00:00, 72.3MB/s]\n",
      "https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/vggsound.zip\n"
     ]
    }
   ],
   "source": [
    "# Upload the archive to the dataset repository on the Hugging Face Hub.\n",
    "!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions vggsound.zip \\\n",
    "--repo-type=dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "786857a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete the local copy of the archive.\n",
    "!rm vggsound.zip"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
